{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Loading CSV file from Amazon S3 into iguazio file system or database"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import v3io_frames as v3f\n",
    "import os\n",
    "client = v3f.Client('framesd:8081', container='users')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Import sample file from S3 into iguazio file system (v3io)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current\n",
      "                                 Dload  Upload   Total   Spent    Left  Speed\n",
      "100  861k  100  861k    0     0  2290k      0 --:--:-- --:--:-- --:--:-- 2284k\n"
     ]
    }
   ],
   "source": [
    "%%sh\n",
    "mkdir -p /v3io/${V3IO_HOME}/examples/stocks\n",
    "\n",
    "# Download a sample stocks file from Iguazio demo bucket in S3\n",
    "curl -L \"https://s3.wasabisys.com/iguazio/data/stocks/2018-03-26_BINS_XETR08.csv\" > /v3io/${V3IO_HOME}/examples/stocks/stocks_example.csv"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Read the file using into a pandas DataFrame\n",
    "Note the file can be read directly from HTTP into a DataFrame (if placing the full URL i.e. `pd.read_csv('http://deutsche-boerse...')`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_path = os.path.join('examples', 'stocks')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Mnemonic</th>\n",
       "      <th>SecurityDesc</th>\n",
       "      <th>SecurityType</th>\n",
       "      <th>Currency</th>\n",
       "      <th>SecurityID</th>\n",
       "      <th>Date</th>\n",
       "      <th>Time</th>\n",
       "      <th>StartPrice</th>\n",
       "      <th>MaxPrice</th>\n",
       "      <th>MinPrice</th>\n",
       "      <th>EndPrice</th>\n",
       "      <th>TradedVolume</th>\n",
       "      <th>NumberOfTrades</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>ISIN</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>CH0038389992</th>\n",
       "      <td>BBZA</td>\n",
       "      <td>BB BIOTECH NAM.   SF 0,20</td>\n",
       "      <td>Common stock</td>\n",
       "      <td>EUR</td>\n",
       "      <td>2504244</td>\n",
       "      <td>2018-03-26</td>\n",
       "      <td>08:00</td>\n",
       "      <td>56.4000</td>\n",
       "      <td>56.4000</td>\n",
       "      <td>56.4000</td>\n",
       "      <td>56.40</td>\n",
       "      <td>320</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>CH0038863350</th>\n",
       "      <td>NESR</td>\n",
       "      <td>NESTLE NAM.        SF-,10</td>\n",
       "      <td>Common stock</td>\n",
       "      <td>EUR</td>\n",
       "      <td>2504245</td>\n",
       "      <td>2018-03-26</td>\n",
       "      <td>08:00</td>\n",
       "      <td>63.0400</td>\n",
       "      <td>63.0600</td>\n",
       "      <td>63.0000</td>\n",
       "      <td>63.06</td>\n",
       "      <td>314</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>LU0378438732</th>\n",
       "      <td>C001</td>\n",
       "      <td>COMSTAGE-DAX UCITS ETF I</td>\n",
       "      <td>ETF</td>\n",
       "      <td>EUR</td>\n",
       "      <td>2504271</td>\n",
       "      <td>2018-03-26</td>\n",
       "      <td>08:00</td>\n",
       "      <td>113.4200</td>\n",
       "      <td>113.4200</td>\n",
       "      <td>113.4200</td>\n",
       "      <td>113.42</td>\n",
       "      <td>100</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>LU0411075020</th>\n",
       "      <td>DBPD</td>\n",
       "      <td>XTR.SHORTDAX X2 DA.SW. 1C</td>\n",
       "      <td>ETF</td>\n",
       "      <td>EUR</td>\n",
       "      <td>2504272</td>\n",
       "      <td>2018-03-26</td>\n",
       "      <td>08:00</td>\n",
       "      <td>4.1335</td>\n",
       "      <td>4.1335</td>\n",
       "      <td>4.1295</td>\n",
       "      <td>4.13</td>\n",
       "      <td>102993</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>LU0838782315</th>\n",
       "      <td>XDDX</td>\n",
       "      <td>XTR.DAX INCOME 1D</td>\n",
       "      <td>ETF</td>\n",
       "      <td>EUR</td>\n",
       "      <td>2504277</td>\n",
       "      <td>2018-03-26</td>\n",
       "      <td>08:00</td>\n",
       "      <td>105.1400</td>\n",
       "      <td>105.2000</td>\n",
       "      <td>105.1400</td>\n",
       "      <td>105.20</td>\n",
       "      <td>239</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "             Mnemonic               SecurityDesc  SecurityType Currency  \\\n",
       "ISIN                                                                      \n",
       "CH0038389992     BBZA  BB BIOTECH NAM.   SF 0,20  Common stock      EUR   \n",
       "CH0038863350     NESR  NESTLE NAM.        SF-,10  Common stock      EUR   \n",
       "LU0378438732     C001   COMSTAGE-DAX UCITS ETF I           ETF      EUR   \n",
       "LU0411075020     DBPD  XTR.SHORTDAX X2 DA.SW. 1C           ETF      EUR   \n",
       "LU0838782315     XDDX          XTR.DAX INCOME 1D           ETF      EUR   \n",
       "\n",
       "              SecurityID        Date   Time  StartPrice  MaxPrice  MinPrice  \\\n",
       "ISIN                                                                          \n",
       "CH0038389992     2504244  2018-03-26  08:00     56.4000   56.4000   56.4000   \n",
       "CH0038863350     2504245  2018-03-26  08:00     63.0400   63.0600   63.0000   \n",
       "LU0378438732     2504271  2018-03-26  08:00    113.4200  113.4200  113.4200   \n",
       "LU0411075020     2504272  2018-03-26  08:00      4.1335    4.1335    4.1295   \n",
       "LU0838782315     2504277  2018-03-26  08:00    105.1400  105.2000  105.1400   \n",
       "\n",
       "              EndPrice  TradedVolume  NumberOfTrades  \n",
       "ISIN                                                  \n",
       "CH0038389992     56.40           320               4  \n",
       "CH0038863350     63.06           314               3  \n",
       "LU0378438732    113.42           100               1  \n",
       "LU0411075020      4.13        102993               8  \n",
       "LU0838782315    105.20           239               3  "
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# read a csv file into a data frame \n",
    "df = pd.read_csv(os.path.join('/v3io', 'users', os.getenv('V3IO_USERNAME'), data_path, 'stocks_example.csv'))\n",
    "\n",
    "df.set_index('ISIN', inplace=True)\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Write file into iguazio database as key value table using v3io frames"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "tablename = os.path.join(data_path, 'stocks_example_tab')\n",
    "client.write('kv', tablename, df)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Read and write the file using Spark DF"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+------------+--------+--------------------+------------+--------+----------+----------+-----+----------+--------+--------+--------+------------+--------------+\n",
      "|        ISIN|Mnemonic|        SecurityDesc|SecurityType|Currency|SecurityID|      Date| Time|StartPrice|MaxPrice|MinPrice|EndPrice|TradedVolume|NumberOfTrades|\n",
      "+------------+--------+--------------------+------------+--------+----------+----------+-----+----------+--------+--------+--------+------------+--------------+\n",
      "|CH0038389992|    BBZA|BB BIOTECH NAM.  ...|Common stock|     EUR|   2504244|2018-03-26|08:00|      56.4|    56.4|    56.4|    56.4|         320|             4|\n",
      "|CH0038863350|    NESR|NESTLE NAM.      ...|Common stock|     EUR|   2504245|2018-03-26|08:00|     63.04|   63.06|      63|   63.06|         314|             3|\n",
      "|LU0378438732|    C001|COMSTAGE-DAX UCIT...|         ETF|     EUR|   2504271|2018-03-26|08:00|    113.42|  113.42|  113.42|  113.42|         100|             1|\n",
      "|LU0411075020|    DBPD|XTR.SHORTDAX X2 D...|         ETF|     EUR|   2504272|2018-03-26|08:00|    4.1335|  4.1335|  4.1295|    4.13|      102993|             8|\n",
      "|LU0838782315|    XDDX|   XTR.DAX INCOME 1D|         ETF|     EUR|   2504277|2018-03-26|08:00|    105.14|   105.2|  105.14|   105.2|         239|             3|\n",
      "|DE000A0DJ6J9|     S92|SMA SOLAR TECHNOL.AG|Common stock|     EUR|   2504287|2018-03-26|08:00|     55.65|   55.65|   55.65|   55.65|         543|             3|\n",
      "|DE000A0D6554|    NDX1|      NORDEX SE O.N.|Common stock|     EUR|   2504290|2018-03-26|08:00|     8.182|    8.21|   8.174|    8.21|       10941|             8|\n",
      "|DE000A0F5UE8|    EXXU|IS.DJ CHINA OFFS....|         ETF|     EUR|   2504302|2018-03-26|08:00|     47.52|   47.52|   47.52|   47.52|         420|             1|\n",
      "|DE000A0HN5C6|    DWNI|DEUTSCHE WOHNEN S...|Common stock|     EUR|   2504314|2018-03-26|08:00|      36.2|   36.24|    36.2|   36.24|         580|             5|\n",
      "|DE000A0LD2U1|     AOX|ALSTRIA OFFICE RE...|Common stock|     EUR|   2504379|2018-03-26|08:00|     12.25|   12.25|   12.25|   12.25|        1728|             3|\n",
      "|DE000A0LR936|     ST5|           STEICO SE|Common stock|     EUR|   2504382|2018-03-26|08:00|     22.35|   22.35|   22.35|   22.35|         334|             1|\n",
      "|DE000A0MZ4B0|     DLX|DELIGNIT AG      ...|Common stock|     EUR|   2504390|2018-03-26|08:00|      10.3|    10.3|    10.3|    10.3|         850|             1|\n",
      "|DE000A0Q8NC8|    ETLX|ETFS DAXGL.G.MIN....|         ETF|     EUR|   2504397|2018-03-26|08:00|    17.844|  17.844|  17.838|  17.838|        3085|             5|\n",
      "|DE000A0V9YU8|    4RT3|ETFS COM.SEC.DZ08...|         ETC|     EUR|   2504421|2018-03-26|08:00|    5.8895|  5.8895|  5.8895|  5.8895|           0|             1|\n",
      "|DE000A0WMPJ6|    AIXA|  AIXTRON SE NA O.N.|Common stock|     EUR|   2504428|2018-03-26|08:00|      16.8|    16.8|   16.75|  16.755|        3329|             8|\n",
      "|DE000A0Z2XN6|     RIB|RIB SOFTWARE SE  ...|Common stock|     EUR|   2504436|2018-03-26|08:00|     24.66|   24.66|   24.52|   24.52|       11741|            29|\n",
      "|DE000A0Z2ZZ5|    FNTN|  FREENET AG NA O.N.|Common stock|     EUR|   2504438|2018-03-26|08:00|     24.41|   24.42|   24.41|   24.42|         695|             6|\n",
      "|DE000A1A6V48|     KSC|      KPS AG NA O.N.|Common stock|     EUR|   2504441|2018-03-26|08:00|      9.15|    9.15|    9.15|    9.15|          73|             1|\n",
      "|DE000A1DAHH0|     BNR| BRENNTAG AG NA O.N.|Common stock|     EUR|   2504453|2018-03-26|08:00|     48.14|   48.14|   48.14|   48.14|         185|             2|\n",
      "|DE000A1EWWW0|     ADS|   ADIDAS AG NA O.N.|Common stock|     EUR|   2504471|2018-03-26|08:00|     196.3|  196.35|   196.3|  196.35|         591|            12|\n",
      "+------------+--------+--------------------+------------+--------+----------+----------+-----+----------+--------+--------+--------+------------+--------------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "from pyspark.sql import SparkSession\n",
    "spark = SparkSession.builder.appName(\"Iguazio file access notebook\").getOrCreate()\n",
    "\n",
    "file_path=os.path.join(os.getenv('V3IO_HOME_URL'), data_path)\n",
    "\n",
    "# Read the sample stocks.csv file into a Spark DataFrame, and let Spark infer the schema of the CSV file\n",
    "df = spark.read.option(\"header\", \"true\").csv(os.path.join(file_path, 'stocks_example.csv'))\n",
    "\n",
    "# Show the DataFrame data\n",
    "df.show()\n",
    "\n",
    "# Write the DataFrame data to a stocks_tab table under \"users\" container and define \"ISIN\" column as a key\n",
    "df.write.format(\"io.iguaz.v3io.spark.sql.kv\").mode(\"append\").option(\"key\", \"ISIN\").option(\"allow-overwrite-schema\", \"true\").save(os.path.join(file_path, 'stocks_tab_spark'))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Read iguazio table and writing it back as a CSV "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "#myDF2 = spark.read.format(\"io.iguaz.v3io.spark.sql.kv\").load(\"v3io://users/iguazio/examples/stocks_tab_by_spark\").where(\"TradedVolume>20000\")\n",
    "myDF2 = spark.read.format(\"io.iguaz.v3io.spark.sql.kv\").load(os.path.join(file_path, 'stocks_tab_spark')).where(\"TradedVolume>20000\")\n",
    "\n",
    "# myDF2.write.csv('v3io://bigdata/examples/stocks_high_volume.csv')\n",
    "myDF2.coalesce(1).write.mode('overwrite').csv(os.path.join(file_path, 'stocks_high_volume.csv'))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "> **Note**: Using `coalesce(1)` is for storing the output as a single file."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Viewing files \n",
    "Note: the table will apear as a directory under v3io file system"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "total 862\n",
      "-rw-r--r-- 1 51 nogroup 882055 Oct 18 09:17 stocks_example.csv\n",
      "drwxr-xr-x 2 51 nogroup      0 Oct 18 09:17 stocks_high_volume.csv\n",
      "drwxrwxrwx 2 51 nogroup      0 Oct 18 09:17 stocks_tab_spark\n"
     ]
    }
   ],
   "source": [
    "!ls -l /v3io/${V3IO_HOME}/examples/stocks"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Remove all files and tables"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# clean data\n",
    "#!rm -rf /v3io/${V3IO_HOME}/examples/stocks"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "In order to release compute and memory resources taken by spark we recommend running the following command "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "spark.stop()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
